import scrapy

from duanzi01.items import Duanzi01Item
# Custom spider class; subclasses scrapy.Spider.
class DuanziSpider(scrapy.Spider):
    """Spider that scrapes joke titles and bodies from duanzixing.com.

    Yields one ``Duanzi01Item`` (fields: ``title``, ``con``) per
    ``<article class="excerpt">`` element on the listing page.
    """

    # Unique spider name, used on the command line: `scrapy crawl duanzi`.
    name = "duanzi"
    # Restrict the crawl to this domain so the spider can't wander off-site.
    allowed_domains = ["duanzixing.com"]
    # Initial URL(s) the engine schedules on startup.
    start_urls = ["http://duanzixing.com/"]

    def parse(self, response):
        """Parse a listing page; overrides scrapy.Spider.parse.

        Args:
            response: the downloaded page, passed in by the downloader
                middleware / engine.

        Yields:
            Duanzi01Item: one populated item per joke on the page.
        """
        for article in response.xpath('//article[@class="excerpt"]'):
            # BUG FIX: create a fresh item for every article. The original
            # instantiated a single item before the loop and re-yielded it,
            # so each iteration mutated an object that might still be queued
            # in Scrapy's item pipelines, corrupting previously yielded data.
            item = Duanzi01Item()
            # extract_first() returns the first matched text node, or None
            # when the xpath matches nothing (safer than extract()[0]).
            item['title'] = article.xpath('./header/h2/a/text()').extract_first()
            item['con'] = article.xpath('./p[@class="note"]/text()').extract_first()
            yield item
